package cn.lcfms.test;


import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;

import cn.lcfms.app.admin.bean.ArticleBean;
import cn.lcfms.bin.App;
import cn.lcfms.bin.BaseService;
import cn.lcfms.utils.HttpUtils;
import cn.lcfms.utils.TimeUtils;

/**
 * Crawler packaged as a Spring-backed JUnit test.
 *
 * <p>{@link #test1()} walks a fixed range of article ids, downloads
 * {@code http://cancer.cmt.com.cn/detail/<id>.html} for each one, extracts the
 * title, lead paragraph and body with regular expressions, and inserts every
 * article whose title mentions 瘤 or 癌 through the service returned by
 * {@code App.getService("articles")}.
 *
 * <p>The extraction is regex based and therefore only as reliable as the page
 * markup it was written against; in particular the body pattern stops at the
 * first {@code </div>} after the opening tag, so nested divs truncate the body.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = {"classpath*:springmvc-context.xml"})
public class Pa{

	/** First article id that is fetched (inclusive). */
	private static final int FIRST_ID = 1291277;

	/** End of the fetched id range (exclusive). */
	private static final int END_ID = 1343386;

	/** Category id stored for articles labelled 知识. */
	private static final int CATEGORY_KNOWLEDGE = 25;

	/** Category id stored for every other label. */
	private static final int CATEGORY_OTHER = 26;

	/** Value passed to {@code setMid}; its meaning is defined outside this file. */
	private static final int MID = 25;

	/** Article body: everything between the opening tag and the first closing div. */
	private static final Pattern CONTENT_PATTERN = Pattern.compile(
			"<div class=\"y-xin-cot\">(.*?)</div>",
			Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/** Opening anchor tags inside the body; their targets are rewritten to "/". */
	private static final Pattern ANCHOR_PATTERN = Pattern.compile("<a href=\".*?>");

	/** Article title; single-line and case-sensitive, as in the original expression. */
	private static final Pattern TITLE_PATTERN = Pattern.compile(
			"<h2 class=\"y-title\">(.+?)</h2>");

	/** First paragraph that follows the 导读 heading. */
	private static final Pattern DESC_PATTERN = Pattern.compile(
			"<div class=\"y-daf\">导读</div>.*?<p>(.*?)</p>",
			Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/** Title keywords that classify an article as 新闻; checked first. */
	private static final String[] NEWS_KEYWORDS = {
			"研究", "进展", "2020", "2019", "2018", "2017", "2016", "2015", "突破"
	};

	/** Title keywords that classify an article as 媒体; checked second. */
	private static final String[] MEDIA_KEYWORDS = {
			"大会", "论坛", "年会", "届", "研讨会", "峰会", "会议"
	};

	/**
	 * Fetches every article id in {@code [FIRST_ID, END_ID)} and stores the
	 * tumour-related ones. A failure on one id is reported and does not stop
	 * the run.
	 */
	@Test
	public void test1() {
		for (int i = FIRST_ID; i < END_ID; i++) {
			// Looked up per iteration on purpose: whether the returned service
			// carries per-use state is not visible from this file.
			BaseService service = App.getService("articles");
			System.out.println("当前进度："+i);
			try {
				String str = HttpUtils.browser("http://cancer.cmt.com.cn/detail/"+i+".html");
				String title = title(str);
				if (title == null) {
					// No title element means there is no article page to import.
					System.out.println(i+"不存在");
					continue;
				}
				// Only articles whose title mentions a tumour (瘤) or cancer (癌) are kept.
				if (!title.contains("瘤") && !title.contains("癌")) {
					continue;
				}
				System.out.println(title);

				String labels = labels(title);
				ArticleBean bean = new ArticleBean();
				bean.setSid(i);
				bean.setTitle(title);
				bean.setDesc(desc(str));
				bean.setLabels(labels);
				bean.setDetails(content(str));
				bean.setCreated_at(TimeUtils.getCurrentDateTime());
				bean.setCategory_id(labels.equals("知识") ? CATEGORY_KNOWLEDGE : CATEGORY_OTHER);
				bean.setMid(MID);
				service.insert(bean);
			} catch (Exception e) {
				// Best effort: keep the original progress line, but also expose the
				// real cause so that insert or network errors are not mistaken for
				// a missing page.
				System.out.println(i+"不存在");
				System.err.println(i + ": " + e);
			}
		}
	}

	/**
	 * Extracts the article body.
	 *
	 * @param str page HTML, may be {@code null}
	 * @return the trimmed body with every anchor's opening tag replaced by
	 *         {@code <a href="/">}, or {@code null} when the body container is absent
	 */
	private static String content(String str) {
		if (str == null) {
			return null;
		}
		Matcher matcher = CONTENT_PATTERN.matcher(str);
		if (!matcher.find()) {
			return null;
		}
		String body = ANCHOR_PATTERN.matcher(matcher.group(1)).replaceAll("<a href=\"/\">");
		return body.trim();
	}

	/**
	 * Classifies an article by keywords in its title.
	 *
	 * @param str article title, must not be {@code null}
	 * @return 新闻 if a news keyword occurs, otherwise 媒体 if a media keyword
	 *         occurs, otherwise 知识
	 */
	private static String labels(String str) {
		if (containsAny(str, NEWS_KEYWORDS)) {
			return "新闻";
		}
		if (containsAny(str, MEDIA_KEYWORDS)) {
			return "媒体";
		}
		return "知识";
	}

	/** Returns whether {@code text} contains at least one of {@code keywords}. */
	private static boolean containsAny(String text, String[] keywords) {
		for (String keyword : keywords) {
			if (text.contains(keyword)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extracts the article title.
	 *
	 * @param str page HTML, may be {@code null}
	 * @return the trimmed title, or {@code null} when no title element is found
	 */
	private static String title(String str) {
		if (str == null) {
			return null;
		}
		Matcher matcher = TITLE_PATTERN.matcher(str);
		return matcher.find() ? matcher.group(1).trim() : null;
	}

	/**
	 * Extracts the lead paragraph that follows the 导读 heading.
	 *
	 * <p>The paragraph text is taken from a capturing group, so the complete
	 * text is returned (the previous fixed offset of 5 for the 4-character
	 * {@code </p>} dropped the last character and failed on an empty paragraph).
	 *
	 * @param str page HTML, may be {@code null}
	 * @return the trimmed paragraph with {@code &nbsp;} entities removed, or
	 *         {@code null} when the lead section is absent
	 */
	private static String desc(String str) {
		if (str == null) {
			return null;
		}
		Matcher matcher = DESC_PATTERN.matcher(str);
		if (!matcher.find()) {
			return null;
		}
		return matcher.group(1).trim().replace("&nbsp;", "");
	}
}
